# Load the normalized expression matrix (genes x samples) and the sample
# metadata workbook; rowNames = TRUE uses the first column as sample IDs.
normed_tmrc <- read.csv("~/Desktop/ElSayedLab/TMRC/tmrc_v202011.csv")
tmrc_metadata <- read.xlsx("~/Desktop/ElSayedLab/TMRC/TMRC_metadata.xlsx", rowNames = TRUE)
#subset to keep only samples with clinical outcome recorded
tmrc_clinicaloutcomesamples <- subset(tmrc_metadata, !is.na(clinicaloutcome) & clinicaloutcome != "lost")
# Keep only the expression columns for the outcome-annotated samples; tolower()
# presumably reconciles mixed-case metadata row names with lower-case column
# names — TODO confirm against the actual files.
normed_tmrc_clinicaloutcomes <- normed_tmrc[,tolower(rownames(tmrc_clinicaloutcomesamples))]
# NOTE(review): this assumes the CSV has a column literally named "row.names"
# holding gene IDs; if it does not, `$row.names` is NULL and the rownames are
# silently removed — verify.
rownames(normed_tmrc_clinicaloutcomes) <- normed_tmrc$row.names
#subset first timepoint samples
# Last character of the sample name encodes the timepoint.
timepoint <- str_sub(tmrc_clinicaloutcomesamples$samplename, -1)
#tmrc_firsttimepointsamples <- tmrc_clinicaloutcomesamples[str_sub(tmrc_clinicaloutcomesamples$samplename, -1) == 1,]
# BUG FIX: these two assignments were fused onto one line by the notebook
# flattening, which parsed as a chained assignment
# (CellType <- ...$typeofcellsPCA_normedtmrc <- prcomp(...)) — CellType became
# the prcomp object instead of the cell-type labels (cf. `typeofcells` at its
# later use for the training subset).
CellType <- tmrc_clinicaloutcomesamples$typeofcells
PCA_normedtmrc <- prcomp(normed_tmrc_clinicaloutcomes)
# PCA of samples colored by timepoint. The matrix is genes x samples, so the
# per-sample coordinates live in $rotation (one row per sample/column).
ggplot(as.data.frame(PCA_normedtmrc$rotation), aes(x = PC1, y = PC2, col = timepoint)) +
  geom_point() +
  theme_classic() +
  geom_text(aes(label=rownames(PCA_normedtmrc$rotation)),hjust=0, vjust=0) +
  xlim(c(0.08, .32))
# Same embedding, colored by cell type. (The two plot chains were fused onto
# one line by the notebook flattening; split here.)
ggplot(as.data.frame(PCA_normedtmrc$rotation), aes(x = PC1, y = PC2, col = CellType)) +
  geom_point() +
  theme_classic() +
  geom_text(aes(label=rownames(PCA_normedtmrc$rotation)),hjust=0, vjust=0) +
  xlim(c(0.08, .32))
## Points don't offer a lot of variation based on timepoint, but do based on
## cell type. We'll take this non-separation by timepoint in the PCA as
## evidence that we can (for now) use all timepoints just to see what kind of
## predictive power we have with more samples.
#create a list of random indices ranging from 1 to the number of samples and
#put 90% of the data into training data
#(NOTE(review): the original comment said 70%, but the code uses .9)
set.seed(123)
train_indices <- sort(sample(nrow(t(normed_tmrc_clinicaloutcomes)), nrow(t(normed_tmrc_clinicaloutcomes))*.9))
#creating training data set by selecting the output row values
train <- t(normed_tmrc_clinicaloutcomes)[train_indices,]
#creating test data set by not selecting the output row values
test <- t(normed_tmrc_clinicaloutcomes)[-train_indices,]
# Outcome labels indexed in parallel with the transposed (samples x genes)
# matrix; this relies on metadata rows and expression columns sharing order.
target_train <- tmrc_clinicaloutcomesamples$clinicaloutcome[train_indices]
# Binary encoding: cure = 1, anything else (failure) = 0
target_train_binary <- ifelse(target_train =="cure", 1, 0)
target_test <- tmrc_clinicaloutcomesamples$clinicaloutcome[-train_indices]#subset highly variable genes
# Highly variable genes: per-gene variance across the training samples.
var_genes <- apply(t(train), 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
# NOTE(review): head() keeps only the first 6 of the 500 selected genes —
# confirm this truncation is intentional.
topvar <- head(select_var)
#stats <- scran::modelGeneVar(as.matrix(t(train)))
#var.features <- scran::getTopHVGs(stats, n = 10)
train_varfeatures <- train[,topvar]
celltype <- tmrc_clinicaloutcomesamples[train_indices,]$typeofcells
timepoint2 <- timepoint[train_indices]
# Top-3 HVG expression split by cell type...
par(mfrow = c(1, 3))
boxplot(train_varfeatures[,1] ~ celltype)
boxplot(train_varfeatures[,2] ~ celltype)
boxplot(train_varfeatures[,3] ~ celltype)
# ...and split by clinical outcome. (The two panel groups were fused onto one
# line by the notebook flattening; split here.)
par(mfrow = c(1, 3))
boxplot(train_varfeatures[,1] ~ target_train)
boxplot(train_varfeatures[,2] ~ target_train)
boxplot(train_varfeatures[,3] ~ target_train)
## As expected based on the PCA plot, all the HVG's give us are genes which are
## variable wrt cell type and not cure/fail, so these aren't going to help us
## predict outcome. We will do DE for the condition outcome to get a list of
## genes to use in the model.
#read in raw reads (DESeq2 requires raw counts, not the normalized matrix)
rawreads <- read.xlsx("~/Desktop/ElSayedLab/TMRC/tmrc_rawreads.xlsx", rowNames = TRUE)
# Subset raw counts to the training samples only, so DE genes are chosen
# without looking at the held-out test samples.
rawreads_train <- rawreads[,rownames(train)]
coldata <- data.frame(row.names = rownames(train), outcome = as.factor(target_train))
dds <- DESeqDataSetFromMatrix(countData = rawreads_train,
colData = coldata,
design = ~ outcome)## converting counts to integer mode
# "failure" as the reference level, so a positive log2FC means higher in cures
dds$outcome <- relevel(dds$outcome, ref = "failure")
# Restrict to the genes present in the normalized matrix
dds <- dds[rownames(normed_tmrc_clinicaloutcomes),]
#filter lowly expressed genes
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- DESeq(dds)## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 1086 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## There are 1905 genes with adjusted p-value < .05 and log2FC > 1
res05 <- results(dds, alpha=0.05)
# NOTE(review): these rankings sort ALL genes by log2FoldChange —
# results(alpha = 0.05) does not subset to significant genes. Confirm whether a
# padj < .05 filter was intended before taking the top 25/50.
top25DEgenes_up <- rownames(res05[order(res05$log2FoldChange, decreasing = TRUE),][1:25,])
top25DEgenes_down <- rownames(res05[order(res05$log2FoldChange),][1:25,])
top50DEgenes_updown <- rownames(res05[order(abs(res05$log2FoldChange), decreasing = TRUE),][1:50,])
# Boxplots of top DE genes by outcome. (Several `par()` calls were fused onto
# the preceding statements by the notebook flattening; split here.)
par(mfrow = c(1, 3))
boxplot(train[,top50DEgenes_updown[1]] ~ target_train, ylab = top50DEgenes_updown[1], xlab = "")
boxplot(train[,top50DEgenes_updown[2]] ~ target_train, ylab = top50DEgenes_updown[2], xlab = "")
boxplot(train[,top50DEgenes_updown[3]] ~ target_train, ylab = top50DEgenes_updown[3], xlab = "")
par(mfrow = c(1, 3))
boxplot(train[,top25DEgenes_up[1]] ~ target_train, ylab = top25DEgenes_up[1], xlab = "")
boxplot(train[,top25DEgenes_up[2]] ~ target_train, ylab = top25DEgenes_up[2], xlab = "")
boxplot(train[,top25DEgenes_up[3]] ~ target_train, ylab = top25DEgenes_up[3], xlab = "")
par(mfrow = c(1, 3))
boxplot(train[,top25DEgenes_down[1]] ~ target_train, ylab = top25DEgenes_down[1], xlab = "")
boxplot(train[,top25DEgenes_down[2]] ~ target_train, ylab = top25DEgenes_down[2], xlab = "")
boxplot(train[,top25DEgenes_down[3]] ~ target_train, ylab = top25DEgenes_down[3], xlab = "")## There are 9 down regulated genes and 41 upregulated genes in the top 50 DE genes.
##
## I am not sure why the top Positive FC genes have such outliers. This doesn't seem right to me so something to talk about on Wednesday. For now, I'll just use the top pos DE genes.
# NOTE(review): the comment above says "top pos DE genes" but the code below
# uses top25DEgenes_down — confirm which was intended.
train_DEfeatures <- as.data.frame(train[,top25DEgenes_down[1:25]])
#make the models using multinom: one single-gene logit model per top DE gene
# FIX(review): the printed output below shows three fitted Calls (genes [1],
# [2], [3]) and fit_2vars/fit_3vars are used later, but only fit_1var survived
# the notebook flattening — the two lost definitions are restored here.
# NOTE: multinom() has no `family` argument; `family = binomial` is silently
# absorbed by `...` and has no effect.
fit_1var <- multinom(target_train_binary ~ train[,top25DEgenes_down[1]], data = data.frame(train[,top25DEgenes_down[1]]), family = binomial)## # weights: 3 (2 variable)
fit_2vars <- multinom(target_train_binary ~ train[,top25DEgenes_down[2]], data = data.frame(train[,top25DEgenes_down[2]]), family = binomial)
fit_3vars <- multinom(target_train_binary ~ train[,top25DEgenes_down[3]], data = data.frame(train[,top25DEgenes_down[3]]), family = binomial)
## initial value 19.408121
## iter 10 value 9.447906
## final value 9.447464
## converged
## # weights: 3 (2 variable)
## initial value 19.408121
## iter 10 value 10.288517
## final value 10.288469
## converged
## # weights: 3 (2 variable)
## initial value 19.408121
## iter 10 value 11.521634
## final value 11.521610
## converged
## Call:
## multinom(formula = target_train_binary ~ train[, top25DEgenes_down[1]],
## data = data.frame(train[, top25DEgenes_down[1]]), family = binomial)
##
## Coefficients:
## Values Std. Err.
## (Intercept) 4.010759 1.6658758
## train[, top25DEgenes_down[1]] -1.205467 0.6518964
##
## Residual Deviance: 18.89493
## AIC: 22.89493
## Call:
## multinom(formula = target_train_binary ~ train[, top25DEgenes_down[2]],
## data = data.frame(train[, top25DEgenes_down[2]]), family = binomial)
##
## Coefficients:
## Values Std. Err.
## (Intercept) 3.3149276 1.1577407
## train[, top25DEgenes_down[2]] -0.9931441 0.4413342
##
## Residual Deviance: 20.57694
## AIC: 24.57694
## Call:
## multinom(formula = target_train_binary ~ train[, top25DEgenes_down[3]],
## data = data.frame(train[, top25DEgenes_down[3]]), family = binomial)
##
## Coefficients:
## Values Std. Err.
## (Intercept) 4.468354 1.6234122
## train[, top25DEgenes_down[3]] -1.706104 0.6878339
##
## Residual Deviance: 23.04322
## AIC: 27.04322
# prediction for each sample
# NOTE(review): the newdata passed here is the HVG matrix, not the DE features
# the models were fit on; since the model formulas reference train[, ...]
# directly, these calls likely reproduce training-set (fitted-value)
# predictions regardless of newdata — confirm the intended newdata.
predictions_1var <- predict(fit_1var, as.data.frame(train_varfeatures)[,1:3])
predictions_2var <- predict(fit_2vars, as.data.frame(train_varfeatures)[,1:3])
predictions_3var <- predict(fit_3vars, as.data.frame(train_varfeatures)[,1:3])##
## Not bad! Of our 28 samples, only 2 were misclassified as cure when they were actually failure in the first model, and 3 misclassified in the 2nd and 3rd. My guess is we would rather false negatives (were actually cure when we predict they would be failure to cure). Is this correct?
##
## They're all pretty highly correlated and redundant. Let's do stepwise regression to make sure it agrees that these are redundant.
#full model: logit on four named genes (columns of train_DEfeatures)
fit_full <- glm(target_train_binary ~ ENSG00000282278 + ENSG00000273734 + ENSG00000266086 + ENSG00000268797, family=binomial, data= train_DEfeatures)
#backward stepwise regression (suppressing output with trace = 0)
backwards <- step(fit_full, trace = 0) ## The optimal model using backward stepwise regression with the first top 4 DE genes is:
## target_train_binary ~ ENSG00000282278
##
## with AIC =
## [1] 22.89493
# Null (intercept-only) model: the lower bound of the forward-selection scope
fit_0 <- glm(target_train_binary ~ 1, family=binomial, data = train_DEfeatures)
forwards = step(fit_0, scope=list(lower=formula(fit_0), upper=formula(fit_full)), direction="forward", trace = 0)## The optimal model using forward stepwise regression with the first top 4 DE genes is:
## target_train_binary ~ ENSG00000282278
##
## with AIC =
## [1] 22.89493
##
## So the forwards and backwards stepwise models agree that of the top 4 DE genes, only ENSG00000282278 should be included in the model.
## The formula using both ways stepwise regression finds the same model as well:
##
## target_train_binary ~ ENSG00000282278
##
## with AIC =
## [1] 22.89493
# Baseline check: fit a logit on 4 genes drawn at random from the top-25
# down-regulated list, to compare against the hand-picked models.
set.seed(11)
randomgenes <- sample(1:25, 4)
randomgenessubset <- as.data.frame(train_DEfeatures[,randomgenes])
fit_random <- glm(target_train_binary ~ randomgenessubset[,1] + randomgenessubset[,2] + randomgenessubset[,3] + randomgenessubset[,4], family=binomial, data= randomgenessubset)
#backward stepwise regression (suppressing output with trace = 0)
random <- step(fit_random, trace = 0) ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## The optimal model using random 4 genes and backwards stepwise regression is:
## target_train_binary ~ randomgenessubset[, 1] + randomgenessubset[,
## 3]
##
## with AIC =
## [1] 20.01402
## So even with a random set of 4 genes from the top DE gene list, the AIC is comparable (and even a little bit better!), meaning the model is still pretty comparable. So are the top DE genes really helping us make a better model or would any random subset of the top DE genes do just as well?
## I created an AIC value for each of 100 random draws of 3 genes from the top 25 gene list. This will tell us whether what we got from the above models (with AIC values around 20–27) generally performs better than, worse than, or about the same as a random subset.
# Null distribution of model quality: 100 logit models, each on genes drawn at
# random from the 25 top DE genes, recording the AIC after backward stepwise
# selection.
# NOTE(review): 4 genes are sampled per draw but only the first 3 enter the
# formula — confirm whether the 4th was meant to be included (cf. the earlier
# single fit_random, which used all 4).
n_draws <- 100
AICs <- numeric(n_draws)  # preallocated instead of growing with c() each pass
models <- data.frame(row.names = c("gene1", "gene2", "gene3", "gene4"))
for (i in seq_len(n_draws)) {
  randomgenes <- sample(1:25, 4)
  randomgenessubset <- as.data.frame(train_DEfeatures[, randomgenes])
  fit_random <- glm(target_train_binary ~ randomgenessubset[, 1] + randomgenessubset[, 2] + randomgenessubset[, 3],
                    family = binomial,
                    data = randomgenessubset)
  # backward stepwise regression (suppressing output with trace = 0)
  random <- step(fit_random, trace = 0)
  # Record which genes were drawn (one column per iteration) and the AIC
  models <- cbind(models, randomgenes)
  AICs[i] <- random$aic
}
# Kernel density of the 100 stepwise-model AICs (our "null" distribution)
d <- density(AICs)
plot(d, main="Kernel Density of bootstrapped AICs")
polygon(d, col="red", border="red")## Hmmm, so this isn't great. This is showing us that if we take random subsets of genes from the top 25 genes, the mean AIC for those models is 20.17 , so our models using only the top 3 DE genes really isn't doing any better than a model taking a random subset of 3 genes in the top 25 DE genes. Let's check how highly correlated our top DE genes are. If the top DE genes are highly correlated across samples, this may tell us why our model is only as good as the mean of 100 randomly generated models
## We see a lot of highly correlated gene pairs in the top 25 DE genes, so I guess our model's AIC score is not that surprising since there is a high change that 3 randomly chosen genes have some correlation to the 3 top DE genes we made our initial model with. I think this is overall good news because we may have many gene candidates that end up being good predictors to choose from for a potential assay.
#train LDA classifier on the top-25 down-regulated DE genes
train_LDA <- lda(train_DEfeatures, target_train)
# Resubstitution predictions (on the training data itself, so optimistic)
LDApredictions <- predict(train_LDA, train_DEfeatures)
LDApredictions_df <- data.frame(LDAweights = LDApredictions$x, Prediction = LDApredictions$class, True = as.factor(target_train))
#extract genes which have the highest magnitude weights (similar to if you were going to look at the weights of the PC's)
topLDA_features <- head(train_LDA$scaling[order(abs(train_LDA$scaling), decreasing = TRUE),], 11)
plot(abs(train_LDA$scaling[order(abs(train_LDA$scaling), decreasing = TRUE),]), ylab = "Magnitude of LD Weights", main = "Magnitude of LD Weights for Each Gene")
abline(v = 11.5)## This gives us an indication of which genes are important in helping to separate the cures from the fail to cures. Each point is the weight given to a gene in how 'important' or discriminatory that gene is in the LDA embedding. You could think of this kind of as an elbow plot that you might use to determine how many PC's to use which captures a majority of the variation. From this plot, I would say we should look at the top 11 genes here, which is where the weights start to level off.
#pull out expression data for these top genes
LDAgenessubset <- as.data.frame(train_DEfeatures[,names(topLDA_features)])
#logit model using these genes
# NOTE(review): 11 top LDA genes are extracted above but only 8 appear in this
# formula — confirm the intended feature set.
fit_LDAvars <- glm(target_train_binary ~ ENSG00000281039 + ENSG00000259371 + ENSG00000268797 + ENSG00000282804 + ENSG00000272410 + ENSG00000283977 + ENSG00000173366 + ENSG00000266086, family=binomial, data= LDAgenessubset)## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#backward stepwise regression (suppressing output with trace = 0)
LDAfit <- step(fit_LDAvars, trace = 0) ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## The optimal model using the top 8 genes from the LDA weights and backwards stepwise regression is:
## target_train_binary ~ ENSG00000259371 + ENSG00000272410 + ENSG00000173366
##
## with AIC =
## [1] 19.51135
##
## These three genes are numbers 20, 10 and 13 in the DE gene order.
##
## So using the top LDA genes to build a logit model is a bit better than what we got using the just the top DE genes and is better than the mean AIC of the random subset of top DE genes. So far, this method of feature selection is the most optimal.
##
## The LDA model is 100% accurate in the predictions of pass/fail (although this is kind of obvious since we used the same samples to make predictions as we did to build the model, although I did the same for the logit model and we still didn't get 100% accuracy.
## )
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
##
## These are the first 3 top weighted LD genes. We can see they do a pretty good job of separating out